{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "64e2de80",
   "metadata": {},
   "source": [
    "# AB test\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9f52ff79",
   "metadata": {},
   "source": [
    "## 0. Import libraries"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "6c2c62f0",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-03-05T13:22:44.312194100Z",
     "start_time": "2024-03-05T13:22:40.794306600Z"
    }
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "\n",
    "from lightautoml.addons.hypex import ABTest\n",
    "from lightautoml.addons.hypex.utils.tutorial_data_creation import create_test_data\n",
    "\n",
    "pd.options.display.float_format = '{:,.2f}'.format\n",
    "\n",
    "np.random.seed(42)  # needed to create example data"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2dca3eaa",
   "metadata": {},
   "source": [
    "## 1. Create or upload your dataset\n",
    "In this case we will create random dataset with known effect size  \n",
    "If you have your own dataset, go to the part 2 "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "7b655d2d",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-03-05T13:23:00.204737400Z",
     "start_time": "2024-03-05T13:22:44.315225300Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>user_id</th>\n",
       "      <th>signup_month</th>\n",
       "      <th>treat</th>\n",
       "      <th>pre_spends</th>\n",
       "      <th>post_spends</th>\n",
       "      <th>age</th>\n",
       "      <th>gender</th>\n",
       "      <th>industry</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>488.00</td>\n",
       "      <td>414.44</td>\n",
       "      <td>NaN</td>\n",
       "      <td>M</td>\n",
       "      <td>E-commerce</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>8</td>\n",
       "      <td>1</td>\n",
       "      <td>512.50</td>\n",
       "      <td>462.22</td>\n",
       "      <td>26.00</td>\n",
       "      <td>NaN</td>\n",
       "      <td>E-commerce</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2</td>\n",
       "      <td>7</td>\n",
       "      <td>1</td>\n",
       "      <td>483.00</td>\n",
       "      <td>479.44</td>\n",
       "      <td>25.00</td>\n",
       "      <td>M</td>\n",
       "      <td>Logistics</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>501.50</td>\n",
       "      <td>424.33</td>\n",
       "      <td>39.00</td>\n",
       "      <td>M</td>\n",
       "      <td>E-commerce</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>4</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>543.00</td>\n",
       "      <td>514.56</td>\n",
       "      <td>18.00</td>\n",
       "      <td>F</td>\n",
       "      <td>E-commerce</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9995</th>\n",
       "      <td>9995</td>\n",
       "      <td>10</td>\n",
       "      <td>1</td>\n",
       "      <td>538.50</td>\n",
       "      <td>450.44</td>\n",
       "      <td>42.00</td>\n",
       "      <td>M</td>\n",
       "      <td>Logistics</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9996</th>\n",
       "      <td>9996</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>500.50</td>\n",
       "      <td>430.89</td>\n",
       "      <td>26.00</td>\n",
       "      <td>F</td>\n",
       "      <td>Logistics</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9997</th>\n",
       "      <td>9997</td>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>473.00</td>\n",
       "      <td>534.11</td>\n",
       "      <td>22.00</td>\n",
       "      <td>F</td>\n",
       "      <td>E-commerce</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9998</th>\n",
       "      <td>9998</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>495.00</td>\n",
       "      <td>523.22</td>\n",
       "      <td>67.00</td>\n",
       "      <td>F</td>\n",
       "      <td>E-commerce</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9999</th>\n",
       "      <td>9999</td>\n",
       "      <td>7</td>\n",
       "      <td>1</td>\n",
       "      <td>508.00</td>\n",
       "      <td>475.89</td>\n",
       "      <td>38.00</td>\n",
       "      <td>F</td>\n",
       "      <td>E-commerce</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>10000 rows × 8 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "      user_id  signup_month  treat  pre_spends  post_spends   age gender  \\\n",
       "0           0             0      0      488.00       414.44   NaN      M   \n",
       "1           1             8      1      512.50       462.22 26.00    NaN   \n",
       "2           2             7      1      483.00       479.44 25.00      M   \n",
       "3           3             0      0      501.50       424.33 39.00      M   \n",
       "4           4             1      1      543.00       514.56 18.00      F   \n",
       "...       ...           ...    ...         ...          ...   ...    ...   \n",
       "9995     9995            10      1      538.50       450.44 42.00      M   \n",
       "9996     9996             0      0      500.50       430.89 26.00      F   \n",
       "9997     9997             3      1      473.00       534.11 22.00      F   \n",
       "9998     9998             2      1      495.00       523.22 67.00      F   \n",
       "9999     9999             7      1      508.00       475.89 38.00      F   \n",
       "\n",
       "        industry  \n",
       "0     E-commerce  \n",
       "1     E-commerce  \n",
       "2      Logistics  \n",
       "3     E-commerce  \n",
       "4     E-commerce  \n",
       "...          ...  \n",
       "9995   Logistics  \n",
       "9996   Logistics  \n",
       "9997  E-commerce  \n",
       "9998  E-commerce  \n",
       "9999  E-commerce  \n",
       "\n",
       "[10000 rows x 8 columns]"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data = create_test_data(num_users=10000, rs=52, na_step=10, nan_cols=['age', 'gender'])\n",
    "data"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d87c9442",
   "metadata": {},
   "source": [
    "## 2. AB-test"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0bb6fece",
   "metadata": {},
   "source": [
    "### 2.0 Data\n",
    "Let's correct data to see how AB-test works"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "6f5a8a1f",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-03-05T13:23:00.294937300Z",
     "start_time": "2024-03-05T13:23:00.200596300Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>user_id</th>\n",
       "      <th>signup_month</th>\n",
       "      <th>treat</th>\n",
       "      <th>pre_spends</th>\n",
       "      <th>post_spends</th>\n",
       "      <th>age</th>\n",
       "      <th>gender</th>\n",
       "      <th>industry</th>\n",
       "      <th>group</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>488.00</td>\n",
       "      <td>414.44</td>\n",
       "      <td>NaN</td>\n",
       "      <td>M</td>\n",
       "      <td>E-commerce</td>\n",
       "      <td>test</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>8</td>\n",
       "      <td>1</td>\n",
       "      <td>512.50</td>\n",
       "      <td>462.22</td>\n",
       "      <td>26.00</td>\n",
       "      <td>NaN</td>\n",
       "      <td>E-commerce</td>\n",
       "      <td>test</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2</td>\n",
       "      <td>7</td>\n",
       "      <td>1</td>\n",
       "      <td>483.00</td>\n",
       "      <td>479.44</td>\n",
       "      <td>25.00</td>\n",
       "      <td>M</td>\n",
       "      <td>Logistics</td>\n",
       "      <td>test</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   user_id  signup_month  treat  pre_spends  post_spends   age gender  \\\n",
       "0        0             0      0      488.00       414.44   NaN      M   \n",
       "1        1             8      1      512.50       462.22 26.00    NaN   \n",
       "2        2             7      1      483.00       479.44 25.00      M   \n",
       "\n",
       "     industry group  \n",
       "0  E-commerce  test  \n",
       "1  E-commerce  test  \n",
       "2   Logistics  test  "
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_ab = data.copy()\n",
    "\n",
    "half_data = int(data.shape[0] / 2)\n",
    "data_ab['group'] = ['test'] * half_data + ['control'] * half_data\n",
    "data_ab.head(3)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "690ceec5",
   "metadata": {},
   "source": [
    "### 3.1 Full AB-test\n",
    "\n",
    "Full (basic) version of test includes calculation of all available metrics, which are: \"diff in means\", \"diff in diff\" and \"cuped\"<br>\n",
    "Pay attention, that for \"cuped\" and \"diff in diff\" metrics required target before pilot."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "4108a137",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-03-05T13:23:00.405321800Z",
     "start_time": "2024-03-05T13:23:00.232414600Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'size': {'test': 5000, 'control': 5000},\n",
       " 'difference': {'ate': 1.108044444444488,\n",
       "  'medain_diff': 0.16666666666668561,\n",
       "  'cuped': 0.897496915890514,\n",
       "  'diff_in_diff': 0.610344444444479},\n",
       " 'p-value': {'t-test': 0.15973563889393272,\n",
       "  'mann_whitney': 0.11494755666097989}}"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model = ABTest()\n",
    "results = model.execute(\n",
    "    data=data_ab,\n",
    "    target_field='post_spends',\n",
    "    target_field_before='pre_spends',\n",
    "    group_field='group'\n",
    ")\n",
    "results"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ea252142",
   "metadata": {},
   "source": [
    "### 2.2 Simple AB-test\n",
    "To estimate effect without target data before pilot `calc_difference_method='ate'` can be used - effect will be estimated with \"diff in means\" method"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "0ab77779",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-03-05T13:23:31.806379600Z",
     "start_time": "2024-03-05T13:23:31.442627700Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'size': {'test': 5000, 'control': 5000},\n",
       " 'difference': {'ate': 1.108044444444488},\n",
       " 'p-value': {'t-test': 0.15973563889393272,\n",
       "  'mann_whitney': 0.11494755666097989}}"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model = ABTest(calc_difference_method='ate')\n",
    "model.execute(data=data_ab, target_field='post_spends', group_field='group')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.15"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
